## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
##
## Loading required package: plyr
## Loading required package: stringr
Part of the idea is that for the Laplace noising to work we have to plug in a sigma (level of noising). We simulate having a very good methodology to do so by supplying dCal, a large calibration set, to pick sigma. In practice you don’t have such a set and would need to either know sigma from first principles or experience, or use some of your training data to build it. What we want to demonstrate is the effectiveness of the differential-privacy-inspired Laplace noising technique, so we will give it a good sigma (which one may or may not have in actual practice).
# Baseline: naive effects (impact) coding. The coder is trained on the SAME
# rows the downstream lm() is fit on, so the coded columns leak outcome
# information into the model.
print('naive effects model')
## [1] "naive effects model"
# Fourth argument is the noising level; 0 = no Laplace noising
# (compare with the bSigmaBest run later in this script).
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
# Model uses every coded column except the outcome column(s).
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36368 -0.38461 0.00547 0.37974 1.76628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08056 0.01285 6.270 4.44e-10 ***
## x1 0.26619 0.02110 12.617 < 2e-16 ***
## x2 0.25160 0.03405 7.389 2.18e-13 ***
## x3 0.19060 0.05277 3.612 0.000312 ***
## x4 0.27750 0.02483 11.178 < 2e-16 ***
## x5 0.27266 0.02451 11.122 < 2e-16 ***
## x6 0.21281 0.03761 5.658 1.75e-08 ***
## x7 0.25934 0.02207 11.752 < 2e-16 ***
## x8 0.28974 0.02194 13.208 < 2e-16 ***
## x9 0.24599 0.02020 12.175 < 2e-16 ***
## x10 0.26001 0.01797 14.465 < 2e-16 ***
## n1 0.09067 0.01400 6.475 1.19e-10 ***
## n2 0.09275 0.01416 6.552 7.26e-11 ***
## n3 0.11523 0.01392 8.277 2.31e-16 ***
## n4 0.08853 0.01403 6.308 3.48e-10 ***
## n5 0.07794 0.01441 5.410 7.09e-08 ***
## n6 0.09353 0.01446 6.468 1.25e-10 ***
## n7 0.08617 0.01443 5.974 2.74e-09 ***
## n8 0.10136 0.01409 7.196 8.80e-13 ***
## n9 0.09799 0.01410 6.949 5.00e-12 ***
## n10 0.10318 0.01430 7.214 7.70e-13 ***
## n11 0.11272 0.01402 8.041 1.53e-15 ***
## n12 0.08253 0.01430 5.773 9.04e-09 ***
## n13 0.10629 0.01451 7.324 3.51e-13 ***
## n14 0.10574 0.01409 7.502 9.46e-14 ***
## n15 0.11166 0.01427 7.827 8.13e-15 ***
## n16 0.08986 0.01379 6.518 9.01e-11 ***
## n17 0.08851 0.01406 6.295 3.78e-10 ***
## n18 0.09905 0.01404 7.056 2.36e-12 ***
## n19 0.09338 0.01476 6.328 3.07e-10 ***
## n20 0.11530 0.01387 8.311 < 2e-16 ***
## n21 0.09641 0.01424 6.769 1.71e-11 ***
## n22 0.10090 0.01397 7.225 7.15e-13 ***
## n23 0.08629 0.01374 6.282 4.10e-10 ***
## n24 0.10409 0.01408 7.392 2.12e-13 ***
## n25 0.08755 0.01444 6.063 1.59e-09 ***
## n26 0.09146 0.01401 6.528 8.46e-11 ***
## n27 0.10086 0.01433 7.038 2.69e-12 ***
## n28 0.10048 0.01373 7.316 3.72e-13 ***
## n29 0.09177 0.01455 6.305 3.55e-10 ***
## n30 0.08877 0.01417 6.265 4.57e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5746 on 1959 degrees of freedom
## Multiple R-squared: 0.927, Adjusted R-squared: 0.9255
## F-statistic: 622.2 on 40 and 1959 DF, p-value: < 2.2e-16
# In-sample performance of the naive effects model.
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.568682746837468"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
'naive effects model train',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## Warning: Removed 5 rows containing missing values (geom_smooth).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.140]
# Held-out performance: test rmse 1.80 vs train rmse 0.57 — the naive
# coding badly overfits.
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.80270755169949"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
'naive effects model test',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.293]
# Try re-fitting pred to see if the train/test gap is just a shift/scale issue.
# It is unlikely to be such, but good to look.
f2 <- paste(yName,'pred',sep=' ~ ')
m2 <- lm(f2,data=dTestB) # deliberately fitting on test itself (to show it does not help)
print(summary(m2))
##
## Call:
## lm(formula = f2, data = dTestB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6308 -1.2125 -0.0242 1.1666 6.6364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08973 0.01789 -5.016 5.37e-07 ***
## pred 1.35606 0.02195 61.781 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.779 on 9998 degrees of freedom
## Multiple R-squared: 0.2763, Adjusted R-squared: 0.2762
## F-statistic: 3817 on 1 and 9998 DF, p-value: < 2.2e-16
# Even with the (cheating) test-fit linear adjustment, rmse barely moves
# (1.803 -> 1.778): the damage is not a shift/scale artifact.
dTestB$pred2 <- predict(m2,newdata=dTestB)
print(paste('adjusted test rmse',rmse(dTestB$pred2,dTestB[[yName]])))
## [1] "adjusted test rmse 1.77849696252789"
print(WVPlots::ScatterHist(dTestB,'pred2',yName,
'naive effects model test (adjusted on test)',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.446]
# Laplace-noised effects coding: bSigmaBest is the noising level picked on the
# calibration set (per the discussion above); noising the codes limits the
# outcome-information leak into the training frame.
print(paste('effects model, sigma=',bSigmaBest))
## [1] "effects model, sigma= 12"
bCoder <- trainEffectCoderR(dTrain,yName,vars,bSigmaBest)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3754 -0.7183 -0.0068 0.6815 4.1328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.092236 0.024304 3.795 0.000152 ***
## x1 0.793935 0.035210 22.549 < 2e-16 ***
## x2 0.823889 0.059914 13.751 < 2e-16 ***
## x3 0.589774 0.079360 7.432 1.59e-13 ***
## x4 0.742017 0.036091 20.559 < 2e-16 ***
## x5 0.905066 0.043609 20.754 < 2e-16 ***
## x6 0.796780 0.065344 12.194 < 2e-16 ***
## x7 0.831223 0.035188 23.622 < 2e-16 ***
## x8 0.951869 0.034551 27.550 < 2e-16 ***
## x9 0.926162 0.034600 26.767 < 2e-16 ***
## x10 0.863060 0.029245 29.511 < 2e-16 ***
## n1 0.001761 0.002101 0.838 0.402095
## n2 0.003558 0.002298 1.548 0.121674
## n3 0.007559 0.002338 3.233 0.001245 **
## n4 0.003191 0.002205 1.448 0.147912
## n5 0.003709 0.002239 1.657 0.097753 .
## n6 0.001695 0.002214 0.765 0.444243
## n7 0.004327 0.002121 2.040 0.041464 *
## n8 0.005807 0.002100 2.764 0.005756 **
## n9 0.004047 0.001959 2.066 0.038970 *
## n10 0.003641 0.002226 1.636 0.102100
## n11 0.004698 0.002050 2.292 0.022021 *
## n12 0.002268 0.002100 1.080 0.280180
## n13 0.003287 0.001866 1.762 0.078252 .
## n14 0.006973 0.002009 3.472 0.000529 ***
## n15 0.005796 0.002271 2.553 0.010766 *
## n16 0.009018 0.002048 4.403 1.13e-05 ***
## n17 0.006429 0.001931 3.329 0.000888 ***
## n18 0.005286 0.002002 2.640 0.008347 **
## n19 0.005220 0.002141 2.438 0.014858 *
## n20 0.003646 0.002098 1.738 0.082383 .
## n21 0.007289 0.002162 3.371 0.000765 ***
## n22 0.005832 0.002110 2.764 0.005766 **
## n23 0.002853 0.001821 1.567 0.117355
## n24 0.005495 0.002363 2.326 0.020144 *
## n25 0.003240 0.002073 1.563 0.118235
## n26 0.007238 0.002310 3.133 0.001756 **
## n27 0.009016 0.002293 3.932 8.71e-05 ***
## n28 0.007488 0.002489 3.008 0.002662 **
## n29 0.004898 0.002357 2.078 0.037861 *
## n30 0.007319 0.002123 3.448 0.000576 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.036 on 1959 degrees of freedom
## Multiple R-squared: 0.7629, Adjusted R-squared: 0.758
## F-statistic: 157.6 on 40 and 1959 DF, p-value: < 2.2e-16
# In-sample performance of the sigma-noised effects model.
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.02515198435545"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
paste('effects model train, sigma=',bSigmaBest),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.635]
# Held-out performance: test rmse 1.16 vs train rmse 1.03 — a far smaller
# gap than the naive model's 0.57 vs 1.80.
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.16390152646291"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
paste('effects model test, sigma=',bSigmaBest),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.788]
# Jackknifed effects coding: each training row is coded with that row held
# out, removing the row's self-influence on its own code.
print('effects model, jacknifed')
## [1] "effects model, jacknifed"
# An un-noised (sigma = 0) coder is still trained on all of dTrain; it is
# used only to code the TEST set below (test rows were never in dTrain,
# so no jackknifing is needed there).
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
# dTrainB <- bCoder$codeFrame(dTrain)
# dTrainB <- bCoder$codeFrame(dCal)
dTrainB <- jackknifeEffectCodeR(dTrain,yName,vars)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0503 -0.7460 0.0110 0.6973 3.9378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.080580 0.024499 3.289 0.00102 **
## x1 0.878376 0.036480 24.078 < 2e-16 ***
## x2 0.891187 0.062232 14.320 < 2e-16 ***
## x3 0.769654 0.098820 7.788 1.09e-14 ***
## x4 0.944314 0.043679 21.619 < 2e-16 ***
## x5 0.945152 0.042897 22.033 < 2e-16 ***
## x6 0.849552 0.069355 12.249 < 2e-16 ***
## x7 0.949746 0.037648 25.227 < 2e-16 ***
## x8 1.043654 0.036410 28.664 < 2e-16 ***
## x9 0.962349 0.033361 28.847 < 2e-16 ***
## x10 0.925969 0.029086 31.836 < 2e-16 ***
## n1 0.027136 0.020443 1.327 0.18454
## n2 -0.009259 0.020976 -0.441 0.65896
## n3 0.031421 0.020508 1.532 0.12565
## n4 -0.030794 0.020589 -1.496 0.13491
## n5 -0.020427 0.020843 -0.980 0.32720
## n6 0.009093 0.021514 0.423 0.67261
## n7 0.013300 0.021150 0.629 0.52953
## n8 0.008724 0.020401 0.428 0.66897
## n9 -0.001165 0.020370 -0.057 0.95440
## n10 -0.001888 0.021453 -0.088 0.92990
## n11 0.012744 0.020469 0.623 0.53364
## n12 -0.001472 0.020607 -0.071 0.94306
## n13 -0.034807 0.020765 -1.676 0.09385 .
## n14 0.019618 0.020409 0.961 0.33655
## n15 0.027026 0.020765 1.302 0.19323
## n16 -0.002004 0.019604 -0.102 0.91859
## n17 0.003144 0.020437 0.154 0.87777
## n18 0.008286 0.020588 0.402 0.68738
## n19 -0.040983 0.021189 -1.934 0.05324 .
## n20 -0.012028 0.020187 -0.596 0.55136
## n21 -0.024144 0.021062 -1.146 0.25179
## n22 0.016472 0.020233 0.814 0.41567
## n23 -0.012645 0.020121 -0.628 0.52976
## n24 0.020160 0.021181 0.952 0.34132
## n25 -0.049179 0.020551 -2.393 0.01680 *
## n26 0.021604 0.020720 1.043 0.29723
## n27 -0.022748 0.021124 -1.077 0.28166
## n28 -0.008041 0.020318 -0.396 0.69231
## n29 -0.036017 0.021344 -1.687 0.09167 .
## n30 0.009637 0.020581 0.468 0.63968
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.095 on 1959 degrees of freedom
## Multiple R-squared: 0.7349, Adjusted R-squared: 0.7295
## F-statistic: 135.8 on 40 and 1959 DF, p-value: < 2.2e-16
# In-sample performance of the jackknifed effects model.
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.08397286675349"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
'effects model train, jackknifed',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.965]
# Held-out performance: test rmse 1.08 is in line with train rmse 1.08 —
# the jackknife removes the leak essentially completely.
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.0755272623194"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
'effects model test, jackknifed',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1118]
# vtreat "split" strategy: design the variable treatments on one random half
# of the training data and fit the downstream model on the other half, so the
# model never sees codes fit on its own rows (avoids nested-model bias).
print("vtreat split model")
## [1] "vtreat split model"
pruneSig <- 0.05  # significance threshold for keeping treated variables
print("working vtreat split model")
## [1] "working vtreat split model"
mTitle <- 'vtreat split model'
# Random ~50/50 split: DT rows fit the model, DC rows design treatments.
isTrain <- runif(nrow(dTrain))<=0.5
dTrainDT <- dTrain[isTrain,]
dTrainDC <- dTrain[!isTrain,]
# NOTE(review): minFraction=2.0 is > 1, which looks like it suppresses
# categorical indicator variables entirely — confirm this is intended.
treatments <- vtreat::designTreatmentsN(dTrainDC,vars,yName,
rareSig=0.3,
smFactor=5.0,
minFraction=2.0,
verbose=FALSE,
parallelCluster=cl)
dTrainV <- vtreat::prepare(treatments,dTrainDT,pruneSig=pruneSig,
parallelCluster=cl)
#print(treatments$scoreFrame)
# Keep only treated variables that both survived prepare() and are
# individually significant at pruneSig.
varsV <- intersect(colnames(dTrainV),
treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
print(varsV)
## [1] "x1_catP" "x1_catN" "x2_catN" "x4_catN" "x4_catD" "x5_catN"
## [7] "x5_catD" "x6_catN" "x7_catN" "x7_catD" "x8_catN" "x9_catN"
## [13] "x9_catD" "x10_catP" "x10_catN" "x10_catD" "n20_catN"
# Prepare the held-out test set with the same treatments, restricted to the
# variables selected above, then fit and evaluate the linear model.
dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
varRestriction=varsV,
parallelCluster=cl)
formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
modelV <- lm(formulaV,data=dTrainV)
dTrainV$pred <- predict(modelV,newdata=dTrainV)
print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
## [1] "train rmse 1.14302240429833"
print(WVPlots::ScatterHist(dTrainV,'pred',yName,
paste(mTitle,'train'),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1295]
# Held-out performance: test rmse 1.148 matches train rmse 1.143 — no leak,
# at the cost of fitting on only half the training rows.
dTestV$pred <- predict(modelV,newdata=dTestV)
print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
## [1] "test rmse 1.14827374668532"
print(WVPlots::ScatterHist(dTestV,'pred',yName,
paste(mTitle,'test'),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1448]
# vtreat "cross" strategy: mkCrossFrameNExperiment() builds a cross-validated
# training frame where each row is coded by treatments fit on other rows,
# so ALL of dTrain can be used without nested-model bias.
print("vtreat cross model")
## [1] "vtreat cross model"
pruneSig <- 0.05  # significance threshold for keeping treated variables
# Guard: mkCrossFrameNExperiment only exists in newer vtreat versions.
if("mkCrossFrameNExperiment" %in% ls('package:vtreat')) {
print("working vtreat cross model")
mTitle <- 'vtreat cross model'
crossD <- vtreat::mkCrossFrameNExperiment(dTrain,vars,yName,
rareSig=0.3,
smFactor=5.0,
minFraction=2.0,
parallelCluster=cl)
treatments <- crossD$treatments
# crossFrame is the cross-validated coded copy of dTrain.
dTrainV <- crossD$crossFrame
# print(treatments$scoreFrame)
# Keep treated variables individually significant at pruneSig.
varsV <- intersect(colnames(dTrainV),
treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
print(varsV)
dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
varRestriction=varsV,
parallelCluster=cl)
formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
modelV <- lm(formulaV,data=dTrainV)
print(summary(modelV))
dTrainV$pred <- predict(modelV,newdata=dTrainV)
print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
print(WVPlots::ScatterHist(dTrainV,'pred',yName,
paste(mTitle,'train'),
smoothmethod='lm',annot_size=2))
dTestV$pred <- predict(modelV,newdata=dTestV)
print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
print(WVPlots::ScatterHist(dTestV,'pred',yName,
paste(mTitle,'test'),
smoothmethod='lm',annot_size=2))
} else {
print("cross model not in library")
}
## [1] "working vtreat cross model"
## [1] "x1_catN" "x2_catN" "x2_catD" "x3_catN" "x4_catP" "x4_catN"
## [7] "x5_catN" "x6_catP" "x6_catN" "x7_catN" "x7_catD" "x8_catN"
## [13] "x8_catD" "x9_catN" "x10_catP" "x10_catN" "n3_catD"
##
## Call:
## lm(formula = formulaV, data = dTrainV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2362 -0.7503 -0.0042 0.7292 3.8354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.08983 0.89523 -2.334 0.019674 *
## x1_catN 0.90032 0.03766 23.905 < 2e-16 ***
## x2_catN 0.90069 0.06542 13.767 < 2e-16 ***
## x2_catD 0.46364 0.20138 2.302 0.021419 *
## x3_catN 0.83154 0.09666 8.603 < 2e-16 ***
## x4_catP -5.25998 2.72649 -1.929 0.053847 .
## x4_catN 0.96990 0.04712 20.584 < 2e-16 ***
## x5_catN 0.96030 0.04440 21.630 < 2e-16 ***
## x6_catP 8.08470 3.03272 2.666 0.007742 **
## x6_catN 0.98357 0.07655 12.848 < 2e-16 ***
## x7_catN 0.92333 0.03931 23.491 < 2e-16 ***
## x7_catD 0.64209 0.18859 3.405 0.000676 ***
## x8_catN 1.02752 0.03826 26.853 < 2e-16 ***
## x8_catD -0.40701 0.22186 -1.834 0.066732 .
## x9_catN 0.98006 0.03461 28.317 < 2e-16 ***
## x10_catP 5.15991 4.07040 1.268 0.205066
## x10_catN 0.94034 0.03306 28.446 < 2e-16 ***
## n3_catD -0.02438 0.01618 -1.507 0.132033
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.107 on 1982 degrees of freedom
## Multiple R-squared: 0.7262, Adjusted R-squared: 0.7239
## F-statistic: 309.3 on 17 and 1982 DF, p-value: < 2.2e-16
##
## [1] "train rmse 1.101539755982"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1625]
## [1] "test rmse 1.05281991330366"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1778]
# Laplace-SMOOTHED (not noised) effects coding, for contrast with the noised
# and jackknifed variants above. sSigma is the smoothing level passed to the
# coder — presumably a pseudo-count toward the grand mean; confirm against
# trainEffectCoderLSmooth's implementation.
sSigma <- 100
print(paste('effects model, smoothing=',sSigma))
## [1] "effects model, smoothing= 100"
bCoder <- trainEffectCoderLSmooth(dTrain,yName,vars,sSigma)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0123 -0.4071 0.0243 0.4038 2.0010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03157 0.01411 2.238 0.0253 *
## x1 0.46900 0.03433 13.661 < 2e-16 ***
## x2 0.43635 0.05605 7.785 1.12e-14 ***
## x3 0.35879 0.08629 4.158 3.35e-05 ***
## x4 0.48251 0.04054 11.902 < 2e-16 ***
## x5 0.49399 0.03994 12.369 < 2e-16 ***
## x6 0.36647 0.06119 5.989 2.50e-09 ***
## x7 0.45372 0.03566 12.725 < 2e-16 ***
## x8 0.51743 0.03583 14.443 < 2e-16 ***
## x9 0.44220 0.03236 13.667 < 2e-16 ***
## x10 0.46038 0.02866 16.066 < 2e-16 ***
## n1 2.50847 0.35889 6.990 3.77e-12 ***
## n2 2.48761 0.36780 6.764 1.77e-11 ***
## n3 3.15687 0.35059 9.004 < 2e-16 ***
## n4 2.56514 0.36398 7.048 2.51e-12 ***
## n5 1.88568 0.36896 5.111 3.52e-07 ***
## n6 2.19744 0.37914 5.796 7.90e-09 ***
## n7 2.08384 0.35166 5.926 3.66e-09 ***
## n8 2.36660 0.34844 6.792 1.46e-11 ***
## n9 2.46809 0.35716 6.910 6.51e-12 ***
## n10 2.20245 0.35444 6.214 6.30e-10 ***
## n11 2.68048 0.34446 7.782 1.15e-14 ***
## n12 1.99004 0.37429 5.317 1.18e-07 ***
## n13 2.61391 0.37424 6.984 3.90e-12 ***
## n14 2.43482 0.35298 6.898 7.09e-12 ***
## n15 2.57561 0.36112 7.132 1.38e-12 ***
## n16 2.43798 0.34197 7.129 1.41e-12 ***
## n17 2.25436 0.34132 6.605 5.11e-11 ***
## n18 2.52469 0.35400 7.132 1.39e-12 ***
## n19 2.50887 0.37768 6.643 3.97e-11 ***
## n20 2.24130 0.34401 6.515 9.21e-11 ***
## n21 2.25363 0.36182 6.229 5.75e-10 ***
## n22 2.29114 0.33994 6.740 2.08e-11 ***
## n23 1.94294 0.33639 5.776 8.88e-09 ***
## n24 2.42563 0.35719 6.791 1.47e-11 ***
## n25 1.98204 0.35585 5.570 2.90e-08 ***
## n26 2.16032 0.33883 6.376 2.26e-10 ***
## n27 2.35022 0.37638 6.244 5.21e-10 ***
## n28 2.07056 0.34846 5.942 3.32e-09 ***
## n29 2.03861 0.35614 5.724 1.20e-08 ***
## n30 2.27701 0.34654 6.571 6.40e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6266 on 1959 degrees of freedom
## Multiple R-squared: 0.9132, Adjusted R-squared: 0.9115
## F-statistic: 515.5 on 40 and 1959 DF, p-value: < 2.2e-16
# In-sample performance of the smoothed effects model.
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.620113653647223"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
paste('effects model train, smoothing=',sSigma),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1955]
# Held-out performance: test rmse 1.69 vs train rmse 0.62 — smoothing alone
# does not fix the overfit the way noising/jackknifing/vtreat did.
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.6938494493661"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
paste('effects model test, smoothing=',sSigma),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.2108]
# Release the parallel cluster, if one was started, and clear the handle so
# later code sees no stale reference.
if (!is.null(cl)) {
  parallel::stopCluster(cl)
  cl <- NULL
}